Figure 02a - Panel C/D/E
1) Setup
- Defining work directory
In this section, we define the working directory for the R Markdown document.
# Get the path of the current script
# Then get the parent directory of the parent directory of the parent directory
local_wd_folder <- dirname(dirname(dirname(rstudioapi::getSourceEditorContext()$path)))
# Set the root directory for knitr to the local working directory
knitr::opts_knit$set(root.dir = local_wd_folder)- Defining input data and output directories
Here, we define the directories for input data and output files.
- Setting seed
Setting a seed ensures that any random processes are reproducible.
- Packages installation (optional)
In this section, we ensure that all necessary packages are installed.
# Ensure BiocManager is available for installation of Bioconductor packages
if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
BiocManager::install(version = "3.18")## Bioconductor version 3.18 (BiocManager 1.30.22), R 4.3.3 (2024-02-29)
## Old packages: 'knitr', 'uwot'
# Define a list of required packages used in this script
packages_required <- c("ComplexHeatmap", "stringr",
"unikn", "RColorBrewer", "yarrr",
"scales", "ggsci", "R.utils", "GenomicFeatures",
"tximport", "DESeq2", "tidyverse", "plotly", "dplyr",
"ashr", "apeglm", "EnhancedVolcano", "IHW", "circlize", "Matrix")
# Identify any required packages that are not installed
packages_uninstalled <- packages_required[!(packages_required %in% installed.packages()[,"Package"])]
# Install any uninstalled packages
if(length(packages_uninstalled)) BiocManager::install(packages_uninstalled)- Loading packages
Here, we load the necessary packages for our analysis.
# Load stringr for string manipulation
library(stringr, quietly = TRUE)
library(reticulate)
# Load ComplexHeatmap for creating complex heatmaps
library(ComplexHeatmap, quietly = TRUE)
library(EnhancedVolcano)
library(GenomicFeatures)
library(circlize)
library(tidyverse)
library(R.utils)
library(tximport)
library(DESeq2)
library(dplyr)
library(IHW)
library(ashr)
library(apeglm)
library(plotly)- Loading palettes
In this section, we load additional color palettes and define some custom ones.
# Load additional colour palette packages
library(unikn, quietly = TRUE)
library(RColorBrewer, quietly = TRUE)
library(yarrr, quietly = TRUE)
library(scales, quietly = TRUE)
library(ggsci, quietly = TRUE)
# Define a set of custom color palettes from the unikn package
mix_1 <- usecol(pal = c(Karpfenblau, "white", Peach), n = 15)
mix_2 <- usecol(pal = c(rev(pal_seeblau), "white", pal_pinky))
mix_3 <- usecol(pal = c(rev(pal_bordeaux), "white", pal_petrol), n = 15)
# Display the custom palettes
seecol(list(mix_1, mix_2, mix_3), col_brd = "white", lwd_brd = 4, title = "Comparing palettes mixed from unikn colors", pal_names = c("mix_1", "mix_2", "mix_3"))# Define a second set of custom palettes from the RColorBrewer and yarrr packages
brew_mix <- usecol(c(rev(brewer.pal(n = 4, name = "Reds")), "white", brewer.pal(n = 4, name = "Blues")), n = 13)
brew_ext <- usecol(brewer.pal(n = 11, name = "Spectral"), n = 12)
yarrr_mix <- usecol(c(piratepal("nemo"), piratepal("bugs")))
yarrr_mod <- usecol(c(piratepal("ipod")), n = 9)
# Display the second set of custom palettes
seecol(pal = list(brew_mix, brew_ext, yarrr_mix, yarrr_mod), col_brd = "white", lwd_brd = 2, title = "Using usecol() and seecol() to mix and modify palettes", pal_names = c("brew_mix", "brew_ext", "yarrr_mix", "yarrr_mod"))- Log Session Info
Finally, we log the session information for reproducibility.
# Write the session information to a text file
writeLines(capture.output(sessionInfo()), file.path(script_folder, 'Panel_A_SessionInfo.txt'))
# Print the session information
sessionInfo()## R version 4.3.3 (2024-02-29)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Sonoma 14.3.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/London
## tzcode source: internal
##
## attached base packages:
## [1] stats4 grid stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ggsci_3.0.3 scales_1.3.0
## [3] yarrr_0.1.5 BayesFactor_0.9.12-4.7
## [5] Matrix_1.6-5 coda_0.19-4.1
## [7] jpeg_0.1-10 RColorBrewer_1.1-3
## [9] unikn_1.0.0 plotly_4.10.4
## [11] apeglm_1.24.0 ashr_2.2-63
## [13] IHW_1.30.0 DESeq2_1.42.1
## [15] SummarizedExperiment_1.32.0 MatrixGenerics_1.14.0
## [17] matrixStats_1.3.0 tximport_1.30.0
## [19] R.utils_2.12.3 R.oo_1.26.0
## [21] R.methodsS3_1.8.2 lubridate_1.9.3
## [23] forcats_1.0.0 dplyr_1.1.4
## [25] purrr_1.0.2 readr_2.1.5
## [27] tidyr_1.3.1 tibble_3.2.1
## [29] tidyverse_2.0.0 circlize_0.4.16
## [31] GenomicFeatures_1.54.4 AnnotationDbi_1.64.1
## [33] Biobase_2.62.0 GenomicRanges_1.54.1
## [35] GenomeInfoDb_1.38.8 IRanges_2.36.0
## [37] S4Vectors_0.40.2 BiocGenerics_0.48.1
## [39] EnhancedVolcano_1.20.0 ggrepel_0.9.5
## [41] ggplot2_3.5.1 ComplexHeatmap_2.18.0
## [43] reticulate_1.36.1 stringr_1.5.1
##
## loaded via a namespace (and not attached):
## [1] rstudioapi_0.16.0 jsonlite_1.8.8 shape_1.4.6.1
## [4] magrittr_2.0.3 rmarkdown_2.26 GlobalOptions_0.1.2
## [7] BiocIO_1.12.0 zlibbioc_1.48.2 vctrs_0.6.5
## [10] memoise_2.0.1 Rsamtools_2.18.0 RCurl_1.98-1.14
## [13] SQUAREM_2021.1 mixsqp_0.3-54 htmltools_0.5.8.1
## [16] S4Arrays_1.2.1 progress_1.2.3 curl_5.2.1
## [19] truncnorm_1.0-9 SparseArray_1.2.4 sass_0.4.9
## [22] bslib_0.7.0 htmlwidgets_1.6.4 plyr_1.8.9
## [25] cachem_1.0.8 GenomicAlignments_1.38.2 lifecycle_1.0.4
## [28] iterators_1.0.14 pkgconfig_2.0.3 R6_2.5.1
## [31] fastmap_1.1.1 GenomeInfoDbData_1.2.11 clue_0.3-65
## [34] numDeriv_2016.8-1.1 digest_0.6.35 fdrtool_1.2.17
## [37] colorspace_2.1-0 irlba_2.3.5.1 lpsymphony_1.30.0
## [40] RSQLite_2.3.6 invgamma_1.1 filelock_1.0.3
## [43] fansi_1.0.6 timechange_0.3.0 httr_1.4.7
## [46] abind_1.4-5 compiler_4.3.3 bit64_4.0.5
## [49] withr_3.0.0 doParallel_1.0.17 BiocParallel_1.36.0
## [52] DBI_1.2.2 highr_0.10 biomaRt_2.58.2
## [55] MASS_7.3-60.0.1 rappdirs_0.3.3 DelayedArray_0.28.0
## [58] rjson_0.2.21 tools_4.3.3 glue_1.7.0
## [61] restfulr_0.0.15 cluster_2.1.6 generics_0.1.3
## [64] gtable_0.3.5 tzdb_0.4.0 rmdformats_1.0.4
## [67] data.table_1.15.4 hms_1.1.3 xml2_1.3.6
## [70] utf8_1.2.4 XVector_0.42.0 foreach_1.5.2
## [73] pillar_1.9.0 emdbook_1.3.13 BiocFileCache_2.10.2
## [76] lattice_0.22-6 rtracklayer_1.62.0 bit_4.0.5
## [79] tidyselect_1.2.1 locfit_1.5-9.9 pbapply_1.7-2
## [82] Biostrings_2.70.3 knitr_1.45 bookdown_0.39.1
## [85] xfun_0.43 stringi_1.8.3 lazyeval_0.2.2
## [88] yaml_2.3.8 evaluate_0.23 codetools_0.2-20
## [91] bbmle_1.0.25.1 BiocManager_1.30.22 cli_3.6.2
## [94] munsell_0.5.1 jquerylib_0.1.4 Rcpp_1.0.12
## [97] dbplyr_2.5.0 png_0.1-8 bdsmatrix_1.3-7
## [100] XML_3.99-0.16.1 parallel_4.3.3 MatrixModels_0.5-3
## [103] blob_1.2.4 prettyunits_1.2.0 bitops_1.0-7
## [106] viridisLite_0.4.2 mvtnorm_1.2-4 slam_0.1-50
## [109] crayon_1.5.2 GetoptLong_1.0.5 rlang_1.1.3
## [112] KEGGREST_1.42.0
2) Loading input files
- Loading gene annotation files
We use the following code to download a gene annotation file, convert it into a SQLite database for genomic analysis, or load the existing database if it’s already present.
- Load check or creation of sqlite file
# Define the path to the SQLite database
txdb.filename <- paste(data_folder, "/RNASeq/Annotation_files/GENCODE_v42.annotation.sqlite", sep = '')
# Check if the SQLite database already exists
if (file.exists(txdb.filename)) {
# Print a message indicating that the database already exists and is being loaded
print(sprintf('File %s already exist! Now Loading...',txdb.filename))
# Load the SQLite database
txdb <- loadDb(txdb.filename)
} else {
# Define the URL of the GTF file to download
url <- "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_42/gencode.v42.annotation.gtf.gz"
dir.create(paste(data_folder, "/RNASeq/Annotation_files", sep = ''),
showWarnings = F)
# Define the local path where the GTF file will be saved
destfile <- paste(data_folder, "/RNASeq/Annotation_files/GENCODE_v42.annotation.gtf.gz", sep = '')
# Download the GTF file
download.file(url, destfile)
# Define the path of the decompressed GTF file
gtf <- paste(data_folder, "/RNASeq/Annotation_files/GENCODE_v42.annotation.gtf", sep = '')
# If the decompressed GTF file already exists, remove it
if (file.exists(gtf)) {
file.remove(gtf)
}
# Decompress the GTF file
gunzip(destfile)
# Create a TxDb object from the GTF file
txdb <- makeTxDbFromGFF(gtf)
# Save the TxDb object as a SQLite database for later use
saveDb(txdb, txdb.filename)
# Load the SQLite database
txdb <- loadDb(txdb.filename)
# Print a message indicating that the SQLite database has been generated
print(sprintf('File %s has been generated!',txdb.filename))
}## [1] "File ./Data/RNASeq/Annotation_files/GENCODE_v42.annotation.sqlite already exist! Now Loading..."
- Load check or creation of csv file
We use the following code to load gene metadata from a CSV file, or if it’s not present, to extract it from a GTF file and create the CSV.
# Define the path to the gene metadata CSV file
gencode_gene_info_df.filename <- paste(data_folder, "/RNASeq/Annotation_files/GENCODE_v42.gene_metadata.csv", sep = '')
# Check if the CSV file already exists
if (file.exists(gencode_gene_info_df.filename)) {
# Print a message indicating that the CSV file already exists
print(sprintf('File %s already exist! Now Loading...',gencode_gene_info_df.filename))
# Load the CSV file into a dataframe
Genes_annotation_metadata <- read.csv(paste(data_folder, '/RNASeq/Annotation_files/GENCODE_v42.gene_metadata.csv', sep = ''), header = TRUE)
# Set the row names of the dataframe to the gene IDs
rownames(Genes_annotation_metadata) <- Genes_annotation_metadata$gene_id
} else {
# Define the path to the GTF file and the SQLite database
gtf <- paste(data_folder, "/RNASeq/Annotation_files/GENCODE_v42.annotation.gtf", sep = '')
txdb.filename <- paste(data_folder, "/RNASeq/Annotation_files/GENCODE_v42.annotation.sqlite", sep = '')
# Load the SQLite database
txdb <- loadDb(txdb.filename)
# Load the GTF file into a dataframe
gtf_df <- read.table(file = gtf, sep="\t", header=FALSE)
# Filter the dataframe to include only rows where the third column is 'gene'
gtf_df_genes <- gtf_df %>% dplyr::filter(V3 == 'gene')
# Create an empty dataframe to store the gene metadata
gencode_gene_info_df <- data.frame()
# Create a progress bar
pb <- txtProgressBar(min = 0, max = nrow(gtf_df_genes), style = 3)
# Iterate over the rows of the filtered dataframe
for (gene_index in 1:nrow(gtf_df_genes)) {
# Update the progress bar
setTxtProgressBar(pb, gene_index)
# Extract the gene information from the ninth column of the current row
gene_info <- gsub(";\\s+", ";", gtf_df_genes[gene_index,'V9'])
# Split the gene information into items
items <- strsplit(gene_info, ";")[[1]]
# Iterate over the items and split each one on the space
for (item in items) {
parts <- unlist(strsplit(item, " "))
# If there are exactly two parts, add them to the dataframe as a new column
if (length(parts) == 2) {
gencode_gene_info_df[unlist(strsplit(items[[1]], " "))[[2]],sprintf('%s',parts[1])] <- parts[2]
}
}
}
# Close the progress bar
close(pb)
# Write the dataframe to a CSV file
write.csv(x = gencode_gene_info_df, file = gencode_gene_info_df.filename, row.names = F)
# Print a message indicating that the CSV file has been generated
print(sprintf('File %s has been generated! Now loading...',gencode_gene_info_df.filename))
# Load the CSV file into a dataframe
Genes_annotation_metadata <- read.csv(paste(data_folder, '/RNASeq/Annotation_files/GENCODE_v42.gene_metadata.csv', sep = ''), header = TRUE)
# Set the row names of the dataframe to the gene IDs
rownames(Genes_annotation_metadata) <- Genes_annotation_metadata$gene_id
}## [1] "File ./Data/RNASeq/Annotation_files/GENCODE_v42.gene_metadata.csv already exist! Now Loading..."
- Loading sample metadata files
3) [OPTIONAL]: Importing and grouping raw counts
Only relevant if we start from the raw counts
- Loading counts path
- Gene-level summarization via tximport
- Exporting gene-level counts and TPMs
4) DESeq2 Analysis
- Loading read and TPM matrices
- Filtering non-unique uninformative Ensembl.id
library(DESeq2)
dds = DESeqDataSetFromTximport(txi = TXI.Genes,
colData = Samples_metadata,
rowData= Genes_annotation_metadata[rownames(TXI.Genes$counts),],
design = ~ Condition)## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## using counts and average transcript lengths from tximport
The following filtering step removes many non-informative/duplicated genes
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## using 'avgTxLength' from assays(dds), correcting for library size
## using pre-existing normalization factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## fitting model and testing
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## -- replacing outliers and refitting for 3085 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
Few non-unique genes will remain. We filter out duplicates with low baseMean.
##
## FALSE TRUE
## 326 26777
## DataFrame with 326 rows and 31 columns
## gene_id gene_type
## <character> <character>
## ENSG00000002586.20 ENSG00000002586.20 protein_coding
## ENSG00000002586.20_PAR_Y ENSG00000002586.20_P.. protein_coding
## ENSG00000015479.20 ENSG00000015479.20 protein_coding
## ENSG00000067601.11 ENSG00000067601.11 transcribed_unproces..
## ENSG00000080947.16 ENSG00000080947.16 transcribed_unproces..
## ... ... ...
## ENSG00000291192.1 ENSG00000291192.1 lncRNA
## ENSG00000291223.1 ENSG00000291223.1 lncRNA
## ENSG00000291263.1 ENSG00000291263.1 lncRNA
## ENSG00000291266.1 ENSG00000291266.1 lncRNA
## ENSG00000291280.1 ENSG00000291280.1 lncRNA
## gene_name level tag hgnc_id
## <character> <integer> <character> <character>
## ENSG00000002586.20 CD99 2 NA HGNC:7082
## ENSG00000002586.20_PAR_Y CD99 2 PAR HGNC:7082
## ENSG00000015479.20 MATR3 1 overlapping_locus HGNC:6912
## ENSG00000067601.11 PMS2P4 2 NA HGNC:9129
## ENSG00000080947.16 CROCCP3 2 NA HGNC:29405
## ... ... ... ... ...
## ENSG00000291192.1 ANKRD18DP 2 overlaps_pseudogene HGNC:28016
## ENSG00000291223.1 FAM86DP 2 overlaps_pseudogene HGNC:32659
## ENSG00000291263.1 SMG1P7 2 overlaps_pseudogene NA
## ENSG00000291266.1 SMG1P5 2 overlaps_pseudogene NA
## ENSG00000291280.1 ANKRD20A11P 2 overlaps_pseudogene NA
## havana_gene artif_dupl baseMean
## <character> <character> <numeric>
## ENSG00000002586.20 OTTHUMG00000021073.12 NA 4511.64036
## ENSG00000002586.20_PAR_Y OTTHUMG00000021073.12 NA 4528.92115
## ENSG00000015479.20 OTTHUMG00000129229.7 NA 5829.26679
## ENSG00000067601.11 OTTHUMG00000156923.3 NA 1.06017
## ENSG00000080947.16 OTTHUMG00000037885.4 NA 30.82237
## ... ... ... ...
## ENSG00000291192.1 NA NA 4.9106
## ENSG00000291223.1 NA NA 163.7271
## ENSG00000291263.1 NA NA 97.4232
## ENSG00000291266.1 NA NA 333.7816
## ENSG00000291280.1 NA NA 10.6116
## baseVar allZero dispGeneEst dispGeneIter
## <numeric> <logical> <numeric> <numeric>
## ENSG00000002586.20 10555974.30 FALSE 0.4225708 9
## ENSG00000002586.20_PAR_Y 11701925.40 FALSE 0.4540431 10
## ENSG00000015479.20 2211028.62 FALSE 0.0648926 13
## ENSG00000067601.11 16.77 FALSE 49.0000000 9
## ENSG00000080947.16 1561.80 FALSE 4.7090224 9
## ... ... ... ... ...
## ENSG00000291192.1 111.105 FALSE 12.181512 13
## ENSG00000291223.1 8557.502 FALSE 0.412742 14
## ENSG00000291263.1 4090.590 FALSE 0.471501 14
## ENSG00000291266.1 32591.028 FALSE 0.447433 12
## ENSG00000291280.1 396.229 FALSE 6.365591 14
## dispFit dispersion dispIter dispOutlier dispMAP
## <numeric> <numeric> <integer> <logical> <numeric>
## ENSG00000002586.20 0.366169 0.426228 10 FALSE 0.426228
## ENSG00000002586.20_PAR_Y 0.366102 0.466530 11 FALSE 0.466530
## ENSG00000015479.20 0.362155 0.075931 14 FALSE 0.075931
## ENSG00000067601.11 75.924728 49.000000 10 FALSE 49.000000
## ENSG00000080947.16 2.947947 4.532790 11 FALSE 4.532790
## ... ... ... ... ... ...
## ENSG00000291192.1 16.664925 12.779288 10 FALSE 12.779288
## ENSG00000291223.1 0.837785 0.432863 13 FALSE 0.432863
## ENSG00000291263.1 1.170841 0.500636 10 FALSE 0.500636
## ENSG00000291266.1 0.588459 0.455304 11 FALSE 0.455304
## ENSG00000291280.1 7.898978 6.494157 8 FALSE 6.494157
## Intercept Condition_BP.CMML_vs_Control SE_Intercept
## <numeric> <numeric> <numeric>
## ENSG00000002586.20 11.099672 1.158301 0.356145
## ENSG00000002586.20_PAR_Y 11.034931 1.233652 0.372600
## ENSG00000015479.20 12.605102 -0.112644 0.150382
## ENSG00000067601.11 -0.708441 0.891925 3.867702
## ENSG00000080947.16 5.177179 -0.272903 1.163871
## ... ... ... ...
## ENSG00000291192.1 2.92352 -0.781207 1.963053
## ENSG00000291223.1 7.60633 -0.303185 0.360405
## ENSG00000291263.1 6.50084 0.117172 0.388991
## ENSG00000291266.1 7.97354 0.467345 0.369223
## ENSG00000291280.1 3.90858 -0.626191 1.397052
## SE_Condition_BP.CMML_vs_Control
## <numeric>
## ENSG00000002586.20 0.384673
## ENSG00000002586.20_PAR_Y 0.402444
## ENSG00000015479.20 0.162441
## ENSG00000067601.11 4.175538
## ENSG00000080947.16 1.257398
## ... ...
## ENSG00000291192.1 2.121148
## ENSG00000291223.1 0.389521
## ENSG00000291263.1 0.420375
## ENSG00000291266.1 0.398806
## ENSG00000291280.1 1.509887
## WaldStatistic_Intercept
## <numeric>
## ENSG00000002586.20 31.166165
## ENSG00000002586.20_PAR_Y 29.616032
## ENSG00000015479.20 83.820712
## ENSG00000067601.11 -0.183168
## ENSG00000080947.16 4.448242
## ... ...
## ENSG00000291192.1 1.48927
## ENSG00000291223.1 21.10493
## ENSG00000291263.1 16.71203
## ENSG00000291266.1 21.59545
## ENSG00000291280.1 2.79773
## WaldStatistic_Condition_BP.CMML_vs_Control
## <numeric>
## ENSG00000002586.20 3.011136
## ENSG00000002586.20_PAR_Y 3.065397
## ENSG00000015479.20 -0.693446
## ENSG00000067601.11 0.213607
## ENSG00000080947.16 -0.217038
## ... ...
## ENSG00000291192.1 -0.368294
## ENSG00000291223.1 -0.778353
## ENSG00000291263.1 0.278731
## ENSG00000291266.1 1.171860
## ENSG00000291280.1 -0.414727
## WaldPvalue_Intercept
## <numeric>
## ENSG00000002586.20 3.06352e-213
## ENSG00000002586.20_PAR_Y 9.29058e-193
## ENSG00000015479.20 0.00000e+00
## ENSG00000067601.11 8.54666e-01
## ENSG00000080947.16 8.65758e-06
## ... ...
## ENSG00000291192.1 1.36416e-01
## ENSG00000291223.1 7.16562e-99
## ENSG00000291263.1 1.07120e-62
## ENSG00000291266.1 1.98215e-103
## ENSG00000291280.1 5.14625e-03
## WaldPvalue_Condition_BP.CMML_vs_Control betaConv
## <numeric> <logical>
## ENSG00000002586.20 0.00260272 TRUE
## ENSG00000002586.20_PAR_Y 0.00217381 TRUE
## ENSG00000015479.20 0.48802941 TRUE
## ENSG00000067601.11 0.83085340 TRUE
## ENSG00000080947.16 0.82817887 TRUE
## ... ... ...
## ENSG00000291192.1 0.712654 TRUE
## ENSG00000291223.1 0.436361 TRUE
## ENSG00000291263.1 0.780451 TRUE
## ENSG00000291266.1 0.241253 TRUE
## ENSG00000291280.1 0.678342 TRUE
## betaIter deviance maxCooks replace
## <numeric> <numeric> <logical> <logical>
## ENSG00000002586.20 4 902.3928 NA FALSE
## ENSG00000002586.20_PAR_Y 4 905.4087 NA FALSE
## ENSG00000015479.20 3 853.4472 NA FALSE
## ENSG00000067601.11 5 59.5232 NA FALSE
## ENSG00000080947.16 14 378.9929 NA FALSE
## ... ... ... ... ...
## ENSG00000291192.1 11 175.734 NA FALSE
## ENSG00000291223.1 4 581.103 NA FALSE
## ENSG00000291263.1 4 534.563 NA FALSE
## ENSG00000291266.1 4 652.509 NA FALSE
## ENSG00000291280.1 10 255.859 NA FALSE
df <- as.data.frame(rowData(dds)[grep(FALSE,isUnique(rowData(dds)$gene_name)),])
# Maximum absolute value of Expression by Gene
maxabs <- with(df, aggregate(baseMean, list(gene_name=gene_name), FUN=function(x) max(abs(x))))
# Combine with original data frame
df <- merge(df, maxabs, by="gene_name")
# Get desired rows
df_unique <- subset(df, abs(baseMean) == x)
# Remove duplicate genes (all gene rows should be identical)
df_unique <- df_unique[!duplicated(df_unique$gene_name), ]Now we generate a list of unique genes for the real DESeq2 run
unique_genes <- c(rowData(dds)[grep(TRUE,isUnique(rowData(dds)$gene_name)),]$gene_id,
df_unique$gene_id)
write.csv(x = data.frame(unique_genes), file = paste(data_folder, '/RNASeq/Counts/List_unique_genes.csv', sep = ''), quote = F, row.names = F)
table(isUnique(unique_genes))##
## TRUE
## 26928
Number and proportion of duplicated discarded genes
##
## TRUE
## 151
print((table(isUnique(unique_genes))-table(isUnique(rowData(dds)$gene_name))[[2]])/table(isUnique(unique_genes)))##
## TRUE
## 0.005607546
- Running DESeq2 with unique genes
- Testing parametric dispersion vs local
dds_parametric = DESeqDataSetFromTximport(txi = TXI.Genes,
colData = Samples_metadata,
rowData = Genes_annotation_metadata[rownames(TXI.Genes$counts),],
design = ~ Condition)## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## using counts and average transcript lengths from tximport
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## using 'avgTxLength' from assays(dds), correcting for library size
dds_parametric <- dds_parametric[unique_genes,]
dds_parametric <- DESeq(dds_parametric, fitType = "parametric")## using pre-existing normalization factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## fitting model and testing
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## -- replacing outliers and refitting for 3046 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
Residuals_parametric <- log(mcols(dds_parametric)$dispGeneEst)-log(mcols(dds_parametric)$dispFit)
plotDispEsts(dds_parametric, main= "dispEst: Parametric")Median_Absolute_Residual_parametric <- median(abs(log(mcols(dds_parametric)$dispGeneEst)-log(mcols(dds_parametric)$dispFit)))
dds_parametric$Condition <- relevel(dds_parametric$Condition, ref = "Control")
res_parametric <- results(dds_parametric, contrast=c("Condition", "BP-CMML", "Control"))
dds_local = DESeqDataSetFromTximport(txi = TXI.Genes,
colData = Samples_metadata,
rowData= Genes_annotation_metadata[rownames(TXI.Genes$counts),],
design = ~ Condition)## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## using counts and average transcript lengths from tximport
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## using 'avgTxLength' from assays(dds), correcting for library size
## using pre-existing normalization factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## fitting model and testing
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## -- replacing outliers and refitting for 3046 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
Residuals_local <- log(mcols(dds_local)$dispGeneEst)-log(mcols(dds_local)$dispFit)
plotDispEsts(dds_local, main= "dispEst: Local")Median_Absolute_Residual_local <- median(abs(log(mcols(dds_local)$dispGeneEst)-log(mcols(dds_local)$dispFit)))
dds_local$Condition <- relevel(dds_local$Condition, ref = "Control")
res_local <- results(dds_local, contrast=c("Condition", "BP-CMML", "Control"))
if ((Median_Absolute_Residual_parametric < Median_Absolute_Residual_local) == TRUE) {
Best_scoring_fitType <- 'parametric'
} else {
Best_scoring_fitType <- 'local'
}dds = DESeqDataSetFromTximport(txi = TXI.Genes,
colData = Samples_metadata,
rowData= Genes_annotation_metadata[rownames(TXI.Genes$counts),],
design = ~ Condition)## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## using counts and average transcript lengths from tximport
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## using 'avgTxLength' from assays(dds), correcting for library size
## using pre-existing normalization factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## fitting model and testing
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## -- replacing outliers and refitting for 3046 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
- Generating VST and RLOG normalised counts
## 55 136 171 246 279 293
## ENSG00000000003.15 8.065115 7.654697 7.414760 7.414760 7.414760 7.414760
## ENSG00000000419.14 10.210298 9.617510 9.904598 10.577354 10.088775 10.913115
## ENSG00000000457.14 9.139483 9.401286 9.711656 8.671656 9.198039 8.560446
## 303 316 343 416 495 543
## ENSG00000000003.15 7.665355 7.414760 7.701553 7.985013 7.654199 7.799596
## ENSG00000000419.14 10.130760 10.127476 10.263280 9.980862 9.962822 10.406475
## ENSG00000000457.14 9.276793 9.020454 9.631191 9.566639 8.935847 9.143235
## 544 620 660 671 689 697
## ENSG00000000003.15 7.414760 7.414760 7.414760 7.983726 7.414760 7.998331
## ENSG00000000419.14 9.926808 10.058707 10.012602 9.849691 9.827424 9.737310
## ENSG00000000457.14 9.113867 8.993742 9.088332 8.984527 9.670583 9.043966
## 713 745 756 776 783 812
## ENSG00000000003.15 7.414760 7.414760 8.464892 7.612095 7.414760 7.414760
## ENSG00000000419.14 10.568712 10.224058 9.727712 9.502122 9.807414 10.136797
## ENSG00000000457.14 8.833221 9.023978 9.099994 9.174390 8.947940 9.535468
## 821 822 823 837 872 888
## ENSG00000000003.15 8.215109 7.414760 7.414760 7.414760 7.908169 7.668771
## ENSG00000000419.14 10.185967 9.592477 9.634643 10.291304 9.419343 10.278589
## ENSG00000000457.14 9.316280 9.002598 9.363783 9.175015 8.945857 9.673822
## 919 930 963 965 993 1007
## ENSG00000000003.15 7.414760 7.414760 7.414760 8.055938 8.070688 7.789379
## ENSG00000000419.14 10.645181 9.939289 9.856903 9.649153 10.044526 9.763629
## ENSG00000000457.14 8.463859 8.987876 9.055457 9.478553 9.529790 9.742334
## 1049 1061 1068 1122 1159 1217
## ENSG00000000003.15 7.767139 7.906386 7.414760 7.414760 7.414760 8.602357
## ENSG00000000419.14 9.921815 10.334517 9.962073 10.385162 9.982011 9.906338
## ENSG00000000457.14 9.441179 9.372741 9.367584 8.958028 9.390808 9.396863
## 929 1126 1138 1157 1162 1170
## ENSG00000000003.15 8.162077 8.245891 8.523467 7.736537 7.897546 8.083725
## ENSG00000000419.14 9.951198 9.938706 9.817489 9.804952 10.014576 10.257753
## ENSG00000000457.14 9.042888 9.313433 9.171286 9.030463 9.278497 9.266932
## 1207
## ENSG00000000003.15 7.750065
## ENSG00000000419.14 10.022456
## ENSG00000000457.14 9.046908
vst_mat <- as.data.frame(vst@assays@data@listData[[1]]) %>% add_column(GeneSymbol = as.data.frame(rowData(vst))[,'gene_name'], .before = colnames(dds)[1]) %>% add_column(GeneBiotype = as.data.frame(rowData(vst))[,'gene_type'], .before = colnames(dds)[1])
write.csv(vst_mat, file = paste(data_folder, '/RNASeq/Counts/Processed_counts/Grouped/DGE_DESeq2_vst_normalised_countmat.csv', sep = ''), quote = F, row.names = T)
# Generating rlog normalised counts
rlog <- rlogTransformation(dds, blind=FALSE)## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
## 55 136 171 246 279 293
## ENSG00000000003.15 3.950361 3.368203 3.221205 3.225189 3.221423 3.229764
## ENSG00000000419.14 9.710953 9.028083 9.369426 10.098703 9.577577 10.436407
## ENSG00000000457.14 8.122866 8.453487 8.815104 7.448380 8.199202 7.275077
## 303 316 343 416 495 543
## ENSG00000000003.15 3.382674 3.224439 3.424512 3.833022 3.369032 3.556126
## ENSG00000000419.14 9.623872 9.620333 9.768216 9.456551 9.436106 9.920738
## ENSG00000000457.14 8.299244 7.962929 8.723668 8.649793 7.845588 8.127792
## 544 620 660 671 689 697
## ENSG00000000003.15 3.223315 3.223584 3.230236 3.832398 3.224449 3.855451
## ENSG00000000419.14 9.395008 9.544094 9.492415 9.305826 9.279837 9.173208
## ENSG00000000457.14 8.089030 7.926600 8.055185 7.912947 8.768610 7.994768
## 713 745 756 776 783 812
## ENSG00000000003.15 3.223922 3.228140 4.542076 3.325656 3.225624 3.224430
## ENSG00000000419.14 10.089914 9.725823 9.161917 8.885234 9.256485 9.630602
## ENSG00000000457.14 7.696005 7.968017 8.070511 8.168570 7.861710 8.613107
## 821 822 823 837 872 888
## ENSG00000000003.15 4.183403 3.225191 3.226805 3.229318 3.717059 3.384739
## ENSG00000000419.14 9.684511 8.997572 9.049730 9.798151 8.779474 9.784720
## ENSG00000000457.14 8.349083 7.938325 8.407593 8.169371 7.858635 8.772361
## 919 930 963 965 993 1007
## ENSG00000000003.15 3.225794 3.221746 3.225663 3.942258 3.958647 3.541928
## ENSG00000000419.14 10.168205 9.409230 9.314316 9.066893 9.528249 9.204553
## ENSG00000000457.14 7.105261 7.917604 8.010824 8.546149 8.606393 8.849341
## 1049 1061 1068 1122 1159 1217
## ENSG00000000003.15 3.511106 3.710542 3.221791 3.225425 3.219552 4.727998
## ENSG00000000419.14 9.389246 9.844532 9.435209 9.898169 9.457843 9.371437
## ENSG00000000457.14 8.501459 8.418749 8.412416 7.876535 8.440902 8.448128
## 929 1126 1138 1157 1162 1170
## ENSG00000000003.15 4.104050 4.228812 4.620055 3.468986 3.701202 3.985704
## ENSG00000000419.14 9.422825 9.408579 9.268253 9.253361 9.494635 9.762253
## ENSG00000000457.14 7.993334 8.345474 8.164499 7.976193 8.301795 8.287062
## 1207
## ENSG00000000003.15 3.487348
## ENSG00000000419.14 9.503502
## ENSG00000000457.14 7.998717
rlog_mat <- as.data.frame(rlog@assays@data@listData[[1]]) %>% add_column(GeneSymbol = as.data.frame(rowData(rlog))[,'gene_name'], .before = colnames(dds)[1]) %>% add_column(GeneBiotype = as.data.frame(rowData(rlog))[,'gene_type'], .before = colnames(dds)[1])
write.csv(rlog_mat, file = paste(data_folder, '/RNASeq/Counts/Processed_counts/Grouped/DGE_DESeq2_rlog_normalised_countmat.csv', sep = ''), quote = F, row.names = T)- Performing PCA analysis
rv <- rowVars(assay(vst))
# select the ntop genes by variance
select <- order(rv, decreasing=TRUE)[seq_len(min(500, length(rv)))]
# perform a PCA on the data in assay(x) for the selected genes
pca <- prcomp(t(assay(vst)[select,]))
pca_samples <- as.data.frame(pca$x)
pca_samples$Condition <- colData(dds)[rownames(pca_samples),'Condition']
pca_genes <- as.data.frame(pca$rotation)
pca_genes <- add_column(pca_genes, .before = 'PC1', GeneSymbol = rowData(dds)[rownames(pca_genes),'gene_name'])
dir.create(path = paste(data_folder, '/RNASeq/DESeq2/PCs', sep = ''), showWarnings = F)
write.csv(pca_samples, file = paste(data_folder, '/RNASeq/DESeq2/PCs/DGE_Samples_PCs_from_rlog_normalised_count_mat.csv', sep = ''), quote = F, row.names = T)
write.csv(pca_genes, file = paste(data_folder, '/RNASeq/DESeq2/PCs/DGE_Genes_PCs_from_rlog_normalised_count_mat.csv', sep = ''), quote = F, row.names = T)
var_explained_df <- data.frame(PC= paste0("PC",1:49),
var_explained=(pca$sdev)^2/sum((pca$sdev)^2))
write.csv(var_explained_df, file = paste(data_folder, '/RNASeq/DESeq2/PCs/DGE_PC_variance_explained.csv', sep = ''), quote = F, row.names = T)
tot_var_explained <- (var_explained_df$var_explained[1] +
var_explained_df$var_explained[2] +
var_explained_df$var_explained[3]) * 100- Plotting 3d PCs via plotly
- By condition
With legend
fig <- plot_ly(pca_samples,
x = ~PC2, y = ~PC1, z = ~PC3, color = ~Condition,
colors = c('BP-CMML' = '#DD3429',
'Control' = '#A4BECE') ) %>% add_markers(size = 12, marker = list (size = 12))
fig <- fig %>% layout(
title = '',
showlegend = TRUE,
legend = list(
title = '',
font = Font_characteristics,
x = 0.46, # Move the legend more to the left
y = 0.0, # Move the legend slightly up
xanchor = 'center',
yanchor = 'bottom'
),
scene = list(
bgcolor = "#ffffff",
camera = list(
eye = list(
x = -1.75,
y = 1.85,
z = 1.04
),
center = list(x = -0.425,
y = 0.35, # Move the figure up
z = 0)
)
)
)
# This requires loading python environment with kaleido installed
save_image(fig, paste(figures_folder, '/Not_Included/DGE_Condition_with_legend.png', sep = ''), width = 2.5 * 300, height = 2.5 * 300, scale = 2)Without legend
fig <- plot_ly(pca_samples,
x = ~PC2, y = ~PC1, z = ~PC3, color = ~Condition,
colors = c('BP-CMML' = '#DD3429',
'Control' = '#A4BECE') ) %>% add_markers(size = 10, marker = list (size = 10))
fig <- fig %>% layout(
title = '',
showlegend = FALSE,
legend = list(
title = '',
font = Font_characteristics,
x = 0.45, # Move the legend more to the left
y = -0.03, # Move the legend slightly up
xanchor = 'center',
yanchor = 'bottom'
),
scene = list(
bgcolor = "#ffffff",
camera = list(
eye = list(
x = -1.75,
y = 1.85,
z = 1.04
),
center = list(x = -0.425,
y = 0.35, # Move the figure up
z = 0)
)
)
)
save_image(fig, paste(figures_folder, '/Not_Included/DGE_Condition_without_legend.png', sep = ''), width = 2.5 * 300, height = 2.5 * 300, scale = 2)- By ICC classification
With legend
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
Figure 2C. PCA plot showing samples’ coordinates for top three principal components ranked by variance explained and identified by PCA analysis of VST normalised count data. Samples are coloured by disease state/ICC classification as indicated
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
Without legend
pca_samples_ICC <- as.data.frame(pca$x)
pca_samples_ICC$ICC.classification <- colData(dds)[rownames(pca_samples),'ICC.classification']
fig <- plot_ly(pca_samples_ICC,
x = ~PC2, y = ~PC1, z = ~PC3, color = ~ICC.classification,
colors = c('BPDCN' = '#268418',
'AML with MDS-related cytogenetic abnormalities' = '#B533B8',
'AML with MDS-related gene mutations' = '#FB84A2',
'AML with NPM1' = '#1754B8',
'AML with CEBPA' = '#FFCB7C',
'AML not otherwise specified' = '#8E3318',
'Control' = '#A4BECE'),
marker = list(size = 12))
fig <- fig %>% layout(
title = '',
showlegend = FALSE,
legend = list(
title = '',
font = Font_characteristics,
x = 0.49, # Move the legend more to the left
y = -0.03, # Move the legend slightly up
xanchor = 'center',
yanchor = 'bottom'
),
scene = list(
bgcolor = "#ffffff",
camera = list(
eye = list(
x = -1.75,
y = 1.85,
z = 1.04
),
center = list(x = -0.425,
y = 0.35, # Move the figure up
z = 0)
)
)
)
save_image(fig, paste(figures_folder, '/Not_Included/DGE_ICC_without_legend.png', sep = ''), width = 3 * 300, height = 3 * 300, scale = 2)## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
- By WHO classification
With legend
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
Figure S2. PCA plot showing samples’ coordinates for top three principal components ranked by variance explained and identified by PCA analysis of VST normalised count data. Samples are coloured by disease state/WHO classification as indicated
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
With legend
pca_samples_WHO <- as.data.frame(pca$x)
pca_samples_WHO$WHO.classification <- colData(dds)[rownames(pca_samples),'WHO.classification']
fig <- plot_ly(pca_samples_WHO,
x = ~PC2, y = ~PC1, z = ~PC3, color = ~WHO.classification,
colors = c('BPDCN' = '#268418',
'AML myelodysplasia-related' = '#FB84A2',
'AML with NPM1' = '#1754B8',
'AML with CEBPA' = '#FFCB7C',
'AML with other defined genetic alterations' = '#8E3318',
'Control' = '#A4BECE'),
marker = list(size = 12))
fig <- fig %>% layout(
title = '',
showlegend = FALSE,
legend = list(
title = '',
font = Font_characteristics,
x = 0.46, # Move the legend more to the left
y = 0.0, # Move the legend slightly up
xanchor = 'center',
yanchor = 'bottom'
),
scene = list(
bgcolor = "#ffffff",
camera = list(
eye = list(
x = -1.75,
y = 1.85,
z = 1.04
),
center = list(x = -0.425,
y = 0.35, # Move the figure up
z = 0)
)
)
)
save_image(fig, paste(figures_folder, '/Not_Included/DGE_WHO_with_legend.png', sep = ''), width = 2.5 * 300, height = 2.5 * 300, scale = 2)## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
- DEGs
TF_list <- read.csv(paste(data_folder, '/RNASeq/Resources/Gene_Metadata/Lambert2018_TFs.txt', sep = ''), sep = '\t', header = F)
HGNC_CD_markers_list <- read.csv(paste(data_folder, '/RNASeq/Resources/Gene_Metadata/HGNC_CD_Markers.csv', sep = ''))- BP-CMML vs Control
- Evaluating number of significant genes
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## Warning in nbinomGLM(x = x, Y = YNZ, size = size, weights = weightsNZ, offset =
## offsetNZ, : the line search routine failed, possibly due to insufficient
## numeric precision
## Warning in nbinomGLM(x = x, Y = YNZ, size = size, weights = weightsNZ, offset =
## offsetNZ, : the line search routine failed, possibly due to insufficient
## numeric precision
## Warning in nbinomGLM(x = x, Y = YNZ, size = size, weights = weightsNZ, offset =
## offsetNZ, : the line search routine failed, possibly due to insufficient
## numeric precision
## Warning in nbinomGLM(x = x, Y = YNZ, size = size, weights = weightsNZ, offset =
## offsetNZ, : the line search routine failed, possibly due to insufficient
## numeric precision
## Warning in nbinomGLM(x = x, Y = YNZ, size = size, weights = weightsNZ, offset =
## offsetNZ, : the line search routine failed, possibly due to insufficient
## numeric precision
## Warning in nbinomGLM(x = x, Y = YNZ, size = size, weights = weightsNZ, offset =
## offsetNZ, : the moving direction increases the objective function value
- Performing comparison
dir.create(path = paste(data_folder, '/RNASeq/DESeq2/DEGs_Comparisons', sep = ''),
showWarnings = F)
dir.create(paste(data_folder, '/RNASeq/DESeq2/DEGs_Comparisons/Control_vs_BP-CMML', sep = ''),
showWarnings = F)
resAsh_df <- as.data.frame(resAsh) %>% add_column(GeneSymbol =rowData(dds)[rownames(resAsh),'gene_name'], .before = 'baseMean') %>% add_column(GeneBiotype =rowData(dds)[rownames(resAsh),'gene_type'], .before = 'baseMean')
resAsh_df$Is_TF <- 'No'
resAsh_df[grep(TRUE, resAsh_df$GeneSymbol %in% TF_list$V1),'Is_TF'] <- 'Yes'
resAsh_df$Is_CD_Marker <- 'No'
resAsh_df[grep(TRUE, resAsh_df$GeneSymbol %in% HGNC_CD_markers_list$Approved.symbol),'Is_CD_Marker'] <- 'Yes'
res_significant <- resAsh_df %>% arrange(padj)
write.csv(res_significant,
file = paste(data_folder, '/RNASeq/DESeq2/DEGs_Comparisons/Control_vs_BP-CMML/DGE_BP-CMML_vs_Control_ALL_DEGs_LFC_Ashr.csv', sep = ''),
row.names = T, quote = FALSE)
res_significant <- resAsh_df %>% dplyr::filter(padj < 0.05) %>% dplyr::arrange(padj)
write.csv(res_significant,
file = paste(data_folder, '/RNASeq/DESeq2/DEGs_Comparisons/Control_vs_BP-CMML/DGE_BP-CMML_vs_Control_padj_significant_DEGs_LFC_Ashr.csv', sep = ''),
row.names = T, quote = FALSE)
res_significant_UP <- res_significant %>% dplyr::filter(log2FoldChange >= 0) %>% dplyr::arrange(padj)
write.csv(res_significant_UP,
file = paste(data_folder, '/RNASeq/DESeq2/DEGs_Comparisons/Control_vs_BP-CMML/DGE_BP-CMML_vs_Control_upregulated_padj_significant_DEGs_LFC_Ashr.csv', sep = ''),
row.names = T, quote = FALSE)
res_significant_DOWN <- res_significant %>% dplyr::filter(log2FoldChange < 0) %>% dplyr::arrange(padj)
write.csv(res_significant_DOWN,
file = paste(data_folder, '/RNASeq/DESeq2/DEGs_Comparisons/Control_vs_BP-CMML/DGE_BP-CMML_vs_Control_downregulated_padj_significant_DEGs_LFC_Ashr.csv', sep = ''),
row.names = T, quote = FALSE)5) Vulcano plot
DESeq2_results <- read.csv(paste(data_folder, '/RNASeq/DESeq2/DEGs_Comparisons/Control_vs_BP-CMML/DGE_BP-CMML_vs_Control_ALL_DEGs_LFC_Ashr.csv', sep = ''))Figure 2D. Volcano plot of differentially expressed genes comparing BP- CMML to healthy age-matched control HSPCs.
6) Heatmap uniquely up genes
UP_in_BP_CMML <- read.csv(paste(data_folder, '/RNASeq/DESeq2/DEGs_Comparisons/Control_vs_BP-CMML/DGE_BP-CMML_vs_Control_upregulated_padj_significant_DEGs_LFC_Ashr.csv', sep = ''), row.names = 'X')
UP_in_BP_CMML <- UP_in_BP_CMML %>% filter(GeneBiotype == 'protein_coding')
BP_CMML_genes <- (UP_in_BP_CMML %>% arrange(padj))$GeneSymbol[1:30]
# - - - - - - - - - - -
UP_in_healthy <- read.csv(paste(data_folder, '/RNASeq/DESeq2/DEGs_Comparisons/Control_vs_BP-CMML/DGE_BP-CMML_vs_Control_downregulated_padj_significant_DEGs_LFC_Ashr.csv', sep = ''), row.names = 'X')
UP_in_healthy <- UP_in_healthy %>% filter(GeneBiotype == 'protein_coding')
Healthy_genes <- ((UP_in_healthy %>% arrange(padj))$GeneSymbol[1:30])
genes_to_keep <- c(BP_CMML_genes, Healthy_genes)Figure 2E. Heatmap showing the top 30 up- and down-regulated DEGs per condition, ranked by p.adj value.
7) Calculating condition transcriptional heterogeneity
dir.create(path = paste(data_folder, '/RNASeq/DESeq2/Other', sep = ''),
showWarnings = F)
dir.create(path = paste(data_folder, '/RNASeq/DESeq2/Other/Gene_Dispersion', sep = ''),
showWarnings = F)
dir.create(path = paste(data_folder, '/RNASeq/DESeq2/Other/Gene_Dispersion/Control_vs_BP-CMML', sep = ''),
showWarnings = F)
Control_samples <- (Samples_metadata %>% mutate(Sample.ID = rownames(Samples_metadata)) %>% filter(Condition == 'Control'))$Sample.ID
BPCMML_samples <- (Samples_metadata %>% mutate(Sample.ID = rownames(Samples_metadata)) %>% filter(Condition == 'BP-CMML'))$Sample.ID
TXI.Genes_Control <- TXI.Genes
TXI.Genes_Control$abundance <- TXI.Genes_Control$abundance[,Control_samples]
TXI.Genes_Control$counts <- TXI.Genes_Control$counts[,Control_samples]
TXI.Genes_Control$length <- TXI.Genes_Control$length[,Control_samples]
dds_control = DESeqDataSetFromTximport(txi = TXI.Genes_Control,
colData = Samples_metadata[Control_samples,],
rowData= Genes_annotation_metadata[rownames(TXI.Genes_Control$counts),],
design = ~1)## using counts and average transcript lengths from tximport
## using 'avgTxLength' from assays(dds), correcting for library size
## Warning in DESeq(dds_control, fitType = "local"): the design is ~ 1 (just an
## intercept). is this intended?
## using pre-existing normalization factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 995 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
TXI.Genes_BPCMML <- TXI.Genes
TXI.Genes_BPCMML$abundance <- TXI.Genes_BPCMML$abundance[,BPCMML_samples]
TXI.Genes_BPCMML$counts <- TXI.Genes_BPCMML$counts[,BPCMML_samples]
TXI.Genes_BPCMML$length <- TXI.Genes_BPCMML$length[,BPCMML_samples]
dds_BPCMML = DESeqDataSetFromTximport(txi = TXI.Genes_BPCMML,
colData = Samples_metadata[BPCMML_samples,],
rowData= Genes_annotation_metadata[rownames(TXI.Genes_BPCMML$counts),],
design = ~1)## using counts and average transcript lengths from tximport
## using 'avgTxLength' from assays(dds), correcting for library size
## Warning in DESeq(dds_BPCMML, fitType = "local"): the design is ~ 1 (just an
## intercept). is this intended?
## using pre-existing normalization factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 4429 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
df <- data.frame(row.names = rownames(rowData(dds)))
df$Disp_Control <- rowData(dds_control)[,'dispersion']
df$Disp_BPCMML <- rowData(dds_BPCMML)[,'dispersion']
write.csv(df,
file = paste(data_folder, '/RNASeq/DESeq2/Other/Gene_Dispersion/Control_vs_BP-CMML/Gene_Dispersion_per_condition.csv', sep = ''),
row.names = T)